import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
# Read the vehicle dataset from the local CSV export into a DataFrame.
vehicle = pd.read_csv(filepath_or_buffer="C:/Users/HP/Downloads/vehicle-1.csv")
vehicle

# Column names, the first ten rows, the (rows, columns) shape and the
# dtype of every column.
vehicle.columns
vehicle.head(n=10)
vehicle.shape
vehicle.dtypes

# Descriptive statistics (includes the five-number summary), transposed so
# that each feature is one row.
vehicle.describe().T

# Number of missing values per column.
vehicle.isna().sum()
# Replace the missing values of each listed feature with that feature's median.
#
# The filled column is assigned back to ``vehicle`` rather than calling
# ``vehicle[col].fillna(..., inplace=True)``: the chained in-place form mutates
# an intermediate object, is deprecated in pandas 2.x and leaves ``vehicle``
# unchanged once Copy-on-Write is active.
for column in (
    'circularity',
    'distance_circularity',
    'radius_ratio',
    'pr.axis_aspect_ratio',
    'scatter_ratio',
    'elongatedness',
    'pr.axis_rectangularity',
    'skewness_about.2',
    'skewness_about.1',
    'skewness_about',
    'scaled_radius_of_gyration.1',
    'scaled_radius_of_gyration',
    'scaled_variance',
    'scaled_variance.1',
):
    vehicle[column] = vehicle[column].fillna(vehicle[column].median())

# Re-count the missing values per column after imputation.
vehicle.isnull().sum()
# Draw a box plot for each of the eighteen feature columns, one call per column.
# NOTE(review): no new figure is created between the calls, so outside a
# notebook (one plot per cell) these presumably draw onto the same axes - confirm.
for feature in (
    'compactness',
    'circularity',
    'distance_circularity',
    'radius_ratio',
    'pr.axis_aspect_ratio',
    'max.length_aspect_ratio',
    'scatter_ratio',
    'elongatedness',
    'pr.axis_rectangularity',
    'max.length_rectangularity',
    'scaled_variance',
    'scaled_variance.1',
    'scaled_radius_of_gyration',
    'scaled_radius_of_gyration.1',
    'skewness_about',
    'skewness_about.1',
    'skewness_about.2',
    'hollows_ratio',
):
    sns.boxplot(vehicle[feature])
import random as r

# Median-replacement of "outliers", run once per listed column name.
#
# For each name, 100 random values in [0, 1000) are generated, every value
# further than one standard deviation from the median is blanked out, and the
# blanks are filled with the median.
#
# NOTE(review): this works on freshly generated random data that merely reuses
# the feature names; the ``vehicle`` frame is never read or modified here.
# Confirm whether the treatment was meant to be applied to ``vehicle``.
#
# After the loop ``d``, ``df``, ``median``, ``std`` and ``outliers`` hold the
# values of the last column processed, as in the original step-by-step code.
for column in (
    'skewness_about',
    'radius_ratio',
    'pr.axis_aspect_ratio',
    'max.length_aspect_ratio',
    'scaled_radius_of_gyration.1',
):
    d = [r.random() * 1000 for i in range(0, 100)]
    df = pd.DataFrame({column: d})
    median = df[column].median()
    std = df[column].std()
    outliers = (df[column] - median).abs() > std
    df.loc[outliers, column] = np.nan
    # Assign the filled column back instead of a chained
    # ``df[column].fillna(..., inplace=True)``, which is deprecated and has no
    # effect on ``df`` under pandas Copy-on-Write.
    df[column] = df[column].fillna(median)
# Relative frequency of each target class.
vehicle["class"].value_counts(normalize=True)

# Bar chart of the absolute class counts. ``Series.value_counts`` replaces the
# top-level ``pd.value_counts`` helper, which is deprecated in pandas 2.1+.
vehicle["class"].value_counts().plot(kind="bar")

# Non-null count of every feature, per class.
vehicle.groupby(["class"]).count()

# independent variables
X = vehicle.drop(['class'], axis=1)
# the dependent variable (kept as a one-column DataFrame)
y = vehicle[['class']]

# Pairwise scatter plots of the features with KDE plots on the diagonal.
sns.pairplot(X, diag_kind='kde')

# Annotated heatmap of the feature correlation matrix on a 15x15 figure.
plt.figure(figsize=(15,15))
sns.heatmap(X.corr(),annot=True)
# Feature-only frame: everything except the target column.
df = vehicle.drop(columns='class')

from scipy.stats import zscore

# Apply scipy's z-score to each feature column.
z = df.apply(zscore)
z.head()
from sklearn.model_selection import train_test_split

# Features are the first 18 columns of the frame; the target is the class
# column. Both are copied into NumPy arrays here, before the label encoding
# further down, so ``y`` keeps the class values as they are at this point.
X = np.array(vehicle)[:, :18]
y = np.array(vehicle['class'])

# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Report the share of rows that landed in each partition.
n_rows = len(vehicle.index)
print("{0:0.2f}% vehicle is in training set".format((len(x_train) / n_rows) * 100))
print("{0:0.2f}% vehicle is in test set".format((len(x_test) / n_rows) * 100))

from sklearn.preprocessing import LabelEncoder

# Overwrite the class column of the frame with integer codes.
le = LabelEncoder()
vehicle['class'] = le.fit_transform(vehicle['class'])

from sklearn import svm

# Support-vector classifier with default settings, fitted on the training split.
clr = svm.SVC()
clr.fit(x_train, y_train)
def getAccuracy(testSet, predictions):
    """Return the percentage of positions where predictions match testSet.

    Args:
        testSet: Sequence (list, NumPy array, ...) of expected labels.
        predictions: Sequence of predicted labels, indexed by position; it
            must be at least as long as ``testSet``.

    Returns:
        The share of matching positions as a float between 0.0 and 100.0,
        or 0.0 when ``testSet`` is empty.

    Raises:
        IndexError: If ``predictions`` is a list or array shorter than
            ``testSet``.
    """
    total = len(testSet)
    # Explicit length check instead of ``not testSet``: NumPy arrays do not
    # support truth-testing. An empty test set has no matches to report.
    if total == 0:
        return 0.0
    correct = sum(
        1
        for position, expected in enumerate(testSet)
        if expected == predictions[position]
    )
    return (correct / float(total)) * 100.0
# Hold-out accuracy (in percent) of the SVM fitted on the raw features.
y_pred = clr.predict(x_test)
getAccuracy(y_test, y_pred)

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Rebuild features (first 18 columns) and target (column 18) from the frame's
# current values.
array = vehicle.values
X2 = array[:,0:18]
Y2 = array[:,18]
# 50/50 split; not referenced again in the code visible here.
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y2, test_size=0.50, random_state=1)

num_folds = 10
seed = 10
# ``random_state`` only has an effect when ``shuffle=True``; recent
# scikit-learn releases raise ValueError if it is set while shuffling is off.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = DecisionTreeRegressor()
# NOTE(review): no ``scoring`` is passed, so the estimator's default score is
# used; for a regressor that is presumably R^2 rather than classification
# accuracy - confirm the "Accuracy" label below.
results = cross_val_score(model, X2, Y2, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# Covariance matrix of the standardised features; rowvar=False treats each
# column of ``z`` as one variable.
cov = np.cov(z, rowvar=False)
print(cov)

from sklearn.decomposition import PCA

# PCA keeping all 18 components, to inspect how the variance is distributed.
pca = PCA(n_components=18).fit(z)
print(pca.explained_variance_)
print(pca.components_)
print(pca.explained_variance_ratio_)

# Variance ratio explained by each individual component.
plt.bar(list(range(18)), pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()

# Cumulative variance ratio across the components.
plt.step(list(range(18)), pca.explained_variance_ratio_.cumsum(), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()

# Second PCA restricted to six components, then project the standardised
# features onto them.
pca2 = PCA(n_components=6).fit(z)
print(pca2.components_)
print(pca2.explained_variance_ratio_)
Xpca = pca2.transform(z)
Xpca

# Pairwise scatter plots of the six projected components.
sns.pairplot(pd.DataFrame(Xpca))
# Note that the X variable contains PCA data (the six projected components);
# the target is the current class column of the frame.
X = np.array(Xpca)[:, :6]
y = np.array(vehicle['class'])

# 70/30 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Report the share of rows that landed in each partition.
n_rows = len(vehicle.index)
print("{0:0.2f}% vehicle is in training set".format((len(x_train) / n_rows) * 100))
print("{0:0.2f}% vehicle is in test set".format((len(x_test) / n_rows) * 100))

from sklearn import svm

# Support-vector classifier with default settings, fitted on the PCA features.
clf = svm.SVC()
clf.fit(x_train, y_train)
def getAccuracy(testSet, predictions):
    """Return the percentage of positions where predictions match testSet.

    Args:
        testSet: Sequence (list, NumPy array, ...) of expected labels.
        predictions: Sequence of predicted labels, indexed by position; it
            must be at least as long as ``testSet``.

    Returns:
        The share of matching positions as a float between 0.0 and 100.0,
        or 0.0 when ``testSet`` is empty.

    Raises:
        IndexError: If ``predictions`` is a list or array shorter than
            ``testSet``.
    """
    total = len(testSet)
    # Explicit length check instead of ``not testSet``: NumPy arrays do not
    # support truth-testing. An empty test set has no matches to report.
    if total == 0:
        return 0.0
    correct = sum(
        1
        for position, expected in enumerate(testSet)
        if expected == predictions[position]
    )
    return (correct / float(total)) * 100.0
# Hold-out accuracy (in percent) of the SVM fitted on the PCA features.
y_pred = clf.predict(x_test)
getAccuracy(y_test, y_pred)

from sklearn import metrics
import seaborn as sns

# Confusion matrix of expected vs. predicted labels, wrapped in a DataFrame
# and drawn as an annotated heatmap on a 5x5 figure.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
df_cm = pd.DataFrame(data=cm)
plt.figure(figsize=(5, 5))
sns.heatmap(data=df_cm, annot=True, fmt='g')
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# 10-fold cross-validation of a decision tree on the current ``X`` / ``y``.
num_folds = 10
seed = 7
# ``random_state`` only has an effect when ``shuffle=True``; recent
# scikit-learn releases raise ValueError if it is set while shuffling is off.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
regression_model = DecisionTreeRegressor()
# NOTE(review): no ``scoring`` is passed, so the estimator's default score is
# used; for a regressor that is presumably R^2 rather than classification
# accuracy - confirm the "Accuracy" label below.
results = cross_val_score(regression_model, X, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))